##################################
#JOP REPLICATION CODE
##################################

#Table 1 - derived from 2018, 2020, and 2022 cycles of "JOP Paper Master Data" (incl. in zip file).

#Table 2 - see Codebook. Baseline data derived from Bonica (2017) DIME dataset. All observations pertaining to candidates that competed solely in primary elections (i.e. non-party nominees) removed from dataset by hand - no corresponding code below. 

library("stargazer")
library("tidyverse")
library("ggthemes")
library("plyr")
library("readxl")

# Load the raw DIME file from the working directory.
dime <- read_csv(file = "Original DIME Data.csv")

# Restrict to the 2018, 2020, and 2022 cycles and to federal House/Senate
# seats; a row is kept only when both conditions hold.
df_filtered <- filter(
  dime,
  cycle %in% c(2018, 2020, 2022) &
    seat %in% c("federal:house", "federal:senate")
)

# The dataset does not include nominee status, so each party's nominee for every race-cycle observation was checked on Ballotpedia and all non-nominees were removed manually (8281 obs. -> 2536 obs.).
# This leaves 1-2 obs. per race: in some cases the opposition party did not field a candidate, and the "pwinner" variable is insufficient because many primary elections were cancelled when there was only a single candidate.

# Final version with hand-coded columns added (see codebook/outline)
# Unique identifiers (bonica.rid) retained in order to cross-check and verify manual changes to dataset (see codebook/outline)

# Analysis dataset, read from the Excel workbook at the absolute path below.
# NOTE(review): the path is machine-specific - point it at your local copy
# of the workbook before re-running.
Data <- read_excel(path = "C:/Users/Gabe/Desktop/JOP_Paper_Regression_Analysis.xlsx")

# Regression No. 1 - linear model with party_redbox as the dependent variable
lm1 <- lm(
  formula = party_redbox ~ competitive_broad + abs_cf + quality_candidate +
    party_target + party_spend + incumbent + democrat,
  data = Data
)

# Regression No. 2 - same right-hand side, with candidate_redbox as the
# dependent variable
lm2 <- lm(
  formula = candidate_redbox ~ competitive_broad + abs_cf + quality_candidate +
    party_target + party_spend + incumbent + democrat,
  data = Data
)

# Convert to LaTeX
#
# stargazer() collects every unnamed argument as another object to tabulate,
# so options that take several values (column.labels, covariate.labels) must
# each be passed as a single character vector built with c(). The covariate
# labels are listed in the same order as the terms in the lm1/lm2 formulas.
latex <- stargazer(
  lm1, lm2,
  title = "Results: Party & Candidate Redboxing",
  column.labels = c("Party Redbox", "Candidate Redbox"),
  covariate.labels = c(
    "Competitive General",
    "Ideological Extremism",
    "Prior Elected Office",
    "Party Target",
    "Significant Party Expenditure",
    "Incumbent",
    "Democrat"
  ),
  model.names = TRUE
)

# Tables 3 and 4 - derived from a cosine similarity analysis of "Similarity Test" (incl. in zip file).
# "Similarity Test" was built by transcribing each individual WMP TV ad with commercial transcription software (Descript).
# Each relevant candidate-cycle observation was run individually through the code below and checked for "matches" with outside-group ads, i.e. pairs with a coefficient greater than or equal to 0.5.
# Matches were then coded and linked to the sum expended on the ad(s) in "House Advertising Data 2020 Cycle" and "Senate Advertising Data 2020 Cycle", and those sums were totalled by hand for relevant groups (e.g., party-affiliated super PACs).

library("quanteda") 
library("quanteda.textstats") 
library("quanteda.textmodels")
library("glmnet") 
library("topicmodels")
library("RedditExtractoR")
library("tidyverse")
library("dplyr")
library("readxl")

# Ad transcripts, read from the Excel workbook at the absolute path below.
# NOTE(review): the path is machine-specific - point it at your local copy.
Similarity_Test <- read_excel(path = "C:/Users/Gabe/Desktop/Similarity Test.xlsx")

# Keep the rows for one candidate in one cycle. Update the candidate name
# (and cycle, if needed) and re-run for each candidate in the dataset.
Similarity_rev <- filter(
  Similarity_Test,
  candidate == "Alyse Galvin" & cycle == 2020
)

# Build the document-feature matrix for the selected candidate-cycle texts in
# a single pipeline:
#   1. wrap the `text` column in a corpus, carrying the remaining columns of
#      Similarity_rev along as document variables;
#   2. tokenize, dropping numbers and punctuation;
#   3. remove English stopwords and stem the remaining tokens;
#   4. convert to a dfm, lower-casing features (one dfm() call is enough; a
#      second dfm(tolower = TRUE) pass over an existing dfm is redundant);
#   5. keep only features that occur in at least two documents.
# Despite its name, SpeechCorpus ends up holding the trimmed dfm, which is
# what the similarity step further down consumes.
SpeechCorpus <- corpus(Similarity_rev$text, docvars = Similarity_rev) %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, include_docvars = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  tokens_wordstem() %>%
  dfm(tolower = TRUE) %>%
  dfm_trim(min_docfreq = 2)

# Pairwise cosine similarity between the documents of the dfm
text_simil <- SpeechCorpus %>%
  textstat_simil(margin = "documents", method = "cosine")
text_simil

# Data-frame copy of the dfm, for inspecting individual documents' terms.
# The object name `matrix` is kept unchanged from the original workflow.
matrix <- SpeechCorpus %>%
  convert(to = "data.frame")
